Load required packages
library(Matrix)
library(tidyverse)
library(scales)
library(gridExtra)
library(extrafont)
library(ggrepel)
Read in expression matrix
data_directory <- "../../data/drop-seq/"
expr_readcount_use <-
sparseMatrix(i = readRDS(file.path(data_directory, "expr_readcount_raw_csc_indices.rds")),
p = readRDS(file.path(data_directory, "expr_readcount_raw_csc_indptr.rds")),
x = readRDS(file.path(data_directory, "expr_readcount_raw_csc_values.rds")),
dims = readRDS(file.path(data_directory, "expr_readcount_raw_csc_shape.rds")),
dimnames = readRDS(file.path(data_directory, "expr_readcount_raw_csc_dimnames.rds")),
index1 = FALSE)
dim(expr_readcount_use)
## [1] 27999 27416
Load clustering result
tsne_out_coords <- readRDS(file = file.path(data_directory,
"tsne_out_coords.rds"))
head(tsne_out_coords)
Load gene info
gene_symbols <- read.table(file.path("../..",
"data",
"misc",
"genes.tsv"),
header = FALSE,
row.names = 1,
stringsAsFactors = FALSE)
gene_symbols <- setNames(gene_symbols[[1]], rownames(gene_symbols))
head(gene_symbols)
## ENSMUSG00000051951 ENSMUSG00000089699 ENSMUSG00000102343
## "Xkr4" "Gm1992" "Gm37381"
## ENSMUSG00000025900 ENSMUSG00000109048 ENSMUSG00000025902
## "Rp1" "Rp1" "Sox17"
# Normalize
expr_readcount_norm <-
expr_readcount_use[, Matrix::colSums(expr_readcount_use > 0) >= 200]
expr_readcount_norm <-
expr_readcount_norm[Matrix::rowSums(expr_readcount_norm > 0) >= 30, ]
expr_readcount_norm <-
expr_readcount_norm[Matrix::rowSums(expr_readcount_norm) >= 60, ]
expr_readcount_norm@x <- median(Matrix::colSums(expr_readcount_norm)) *
(expr_readcount_norm@x / rep.int(Matrix::colSums(expr_readcount_norm),
diff(expr_readcount_norm@p)))
Prepare cluster labels
cluster_labels <- tsne_out_coords %>%
group_by(cluster) %>% summarise(x = mean(x),
y = mean(y)) %>%
as.data.frame
# adjust several label positions
cluster_labels[13,] <- c(13, -2.5, -60)
cluster_labels[19,] <- c(19, 0.4228625, 27)
cluster_labels[20,] <- c(20, -52.5, -38.5)
cluster_labels[21,] <- c(21, -5.273788, 0)
Preview clustering
(p <-
ggplot(tsne_out_coords, aes(x, y,
color = cluster)) +
geom_point(size = .6, stroke = 0, shape = 16) +
annotate("text",
family = "Arial",
x = cluster_labels[, "x"],
y = cluster_labels[, "y"], label = cluster_labels[, 1],
size = 2,
color = "black") +
theme_void() +
guides(color = FALSE))
Calculate CPM
expr_cpm_use <- expr_readcount_use
expr_cpm_use@x <- 1000000 *
(expr_cpm_use@x / rep.int(Matrix::colSums(expr_cpm_use),
diff(expr_cpm_use@p)))
dim(expr_cpm_use)
## [1] 27999 27416
Generate violin plots of Tnnt2, Myl2, Sln, Col1a1, Gata6 and Mef2c
# extract colors from the initial plot to keep colors consistent
p_built <- ggplot_build(p)
p_built_colors <- p_built$data[[1]]
rownames(p_built_colors) <- rownames(tsne_out_coords)
cluster_colors <- data.frame(color = p_built_colors["colour"],
cluster = tsne_out_coords[rownames(p_built_colors),
"cluster"])
cluster_colors <- unique(cluster_colors)
rownames(cluster_colors) <- NULL
# order clusters
clusters_selected <- c(19, 18,
9, 6, 11, 4, 8, 21,
3, 1, 2, 10, 17, 22, 20,
13, 7, 14, 12, 15, 5, 16)
# select genes
genes_selected <- c("ENSMUSG00000026414",
"ENSMUSG00000013936",
"ENSMUSG00000042045",
"ENSMUSG00000001506",
"ENSMUSG00000005836",
"ENSMUSG00000005583")
# prepare data
expr_violins <-
do.call(rbind.data.frame,
lapply(clusters_selected, function(x) {
cells_in_selected_cluster <-
rownames(tsne_out_coords[tsne_out_coords$cluster %in% x,])
expr <-
expr_cpm_use[genes_selected,
cells_in_selected_cluster] %>%
as.matrix %>%
as.data.frame %>%
mutate(gene = rownames(.)) %>%
gather(key = "cell",
value = "expr",
- gene)
expr$cluster <- x
return(expr)
}))
# plot
expr_violins %>%
mutate(symbol = factor(gene_symbols[as.character(gene)],
levels = gene_symbols[genes_selected]),
cluster = factor(cluster,
levels = rev(clusters_selected))) %>%
ggplot(aes(cluster, log10(expr + 1),
fill = cluster,
color = cluster)) +
geom_violin(trim = TRUE, lwd = .3) +
theme_bw() +
coord_flip() +
stat_summary(fun.y = median, geom = "point", color = "black", size = 1) +
facet_wrap( ~ symbol, nrow = 1) +
scale_fill_manual(values = rev(cluster_colors[match(clusters_selected,
cluster_colors$cluster),
"colour"])) +
scale_color_manual(values = rev(cluster_colors[match(clusters_selected,
cluster_colors$cluster),
"colour"])) +
scale_y_continuous(name = expression("log"[10]*"(CPM + 1)"),
breaks = c(0, 5)) +
scale_x_discrete(name = "Cluster") +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank()) +
guides(fill = FALSE,
color = FALSE) +
theme(axis.text = element_text(family = "Arial", size = 7),
axis.title = element_text(family = "Arial", size = 8),
axis.title.y = element_text(margin = margin(t = 0, r = 0,
b = 0, l = 0,
unit = "mm")),
strip.text.x = element_text(family = "Arial", size = 8),
legend.title = element_text(family = "Arial", size = 6),
legend.text = element_text(family = "Arial", size = 6),
plot.title = element_text(family = "Arial",
size = 6,
hjust = .5,
margin = margin(t = 0, r = 0,
b = 0, l = 0,
unit = "mm")),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
Format plot
# extract colors from the initial plot
p_built <- ggplot_build(p)
p_built_colors <- p_built$data[[1]]
rownames(p_built_colors) <- rownames(tsne_out_coords)
# get cell groups
cells_reprogr <-
tsne_out_coords %>% mutate(cell = rownames(.)) %>%
filter(category %in% c("JD168")) %>% .$cell
cells_uninfected <-
tsne_out_coords %>% mutate(cell = rownames(.)) %>%
filter(category %in% c("JD174")) %>% .$cell
cells_primary <-
tsne_out_coords %>% mutate(cell = rownames(.)) %>%
filter(! category %in% c("JD168", "JD174")) %>% .$cell
# assign colors
color_info <- p_built_colors
color_info[cells_reprogr, "colour"] <- "#FF5722"
color_info[cells_uninfected, "colour"] <- "#FF5722"
cell_colors <- color_info$colour
names(cell_colors) <- rownames(tsne_out_coords)
cell_colors_uniq <- unique(cell_colors)
names(cell_colors_uniq) <- cell_colors_uniq
tsne_out_coords$color <- cell_colors
Generate plot
cell_type_segments <-
data.frame(x = c(-40, -40, -40, -40, -69, -35, 40, 40, 40, 65, 28, -12, -57.5, -57.5, -57.5),
y = c(60, 60, 60, 60, 20, -55, -70, -70, -70, 50, 55, 65, -52.5, -52.5, -52.5),
xend = c(-30, -44, -60, -30, -69, -35, 32, 0, 40, 65, 35, -20, -17.5, -53, -37.5),
yend = c(35, 15, -5, -22, 35, -65, -58, -66, -45, 36, 60, 70, -25, -45, -47.5),
cluster = c(4, 6, 9, 11, 8, 12, 7, 13, 14, 16, 5, 15, 2, 20, 12))
cell_type_labels <-
data.frame(x = c(-40, -61.5, -43, 52, 56, 46, -22, -58),
y = c(65, 45, -70, -70, 55, 64, 75, -58),
label = c("Atrial myocytes",
"Ventricular\nmyocytes",
"Epicardial cells",
"Fibroblasts",
"Lymphocytes",
"Endothelial cells",
"Macrophages",
"MEF-derived"))
p_tsne_color_primary <-
ggplot() +
geom_point(data = tsne_out_coords, aes(x, y,
color = color),
size = .6, stroke = 0, shape = 16) +
scale_color_manual(values = cell_colors_uniq) +
theme_void() +
annotate("text",
family = "Arial",
x = cluster_labels[, "x"],
y = cluster_labels[, "y"], label = cluster_labels[, 1],
parse = TRUE,
size = 2,
color = c("black")) +
guides(color = FALSE,
shape = FALSE) +
geom_segment(data = cell_type_segments,
aes(x = x, xend = xend, y = y, yend = yend),
color = "grey50",
size = .2) +
geom_text(data = cell_type_labels,
aes(x, y, label = label),
color = c(rep("black", 2), "#00BFC4", rep("black", 4), "#FF5722"),
size = 2.8,
family = "Arial")
p_tsne_color_primary
# prepare data
cell_distribution_primary <-
tsne_out_coords[cells_primary,] %>%
group_by(cluster) %>% summarise(primary.count = n())
cell_distribution_reprogr <-
tsne_out_coords[cells_reprogr,] %>%
group_by(cluster) %>% summarise(reprogr.count = n())
cell_distribution_uninfected <-
tsne_out_coords[cells_uninfected,] %>%
group_by(cluster) %>% summarise(uninfected.count = n())
# order clusters
clusters_selected <- c(19, 18,
9, 6, 11, 4, 8, 21,
3, 1, 2, 10, 17, 22, 20,
13, 7, 14, 12, 15, 5, 16)
# plot
full_join(cell_distribution_primary, cell_distribution_reprogr,
by = "cluster") %>%
full_join(cell_distribution_uninfected, by = "cluster") %>%
select(primary.count : uninfected.count) %>%
mutate(primary.count = replace_na(primary.count, 0),
reprogr.count = replace_na(reprogr.count, 0),
uninfected.count = replace_na(uninfected.count, 0)) %>%
`/`(rowSums(.)) %>%
as.data.frame %>%
gather(key = category,
value = ratio) %>%
mutate(cluster = factor(rep(seq_len(length(levels(tsne_out_coords$cluster))),
3),
levels = rev(clusters_selected))) %>%
ggplot() +
geom_bar(aes(x = cluster, y = ratio, fill = category), stat = "identity") +
theme_light() +
scale_x_discrete(name = NULL,
breaks = seq_len(length(levels(tsne_out_coords$cluster)))) +
scale_y_continuous(name = "Percentage",
breaks = seq(0, 1, .2),
labels = scales::percent) +
coord_flip() +
theme(panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
legend.position = "right",
legend.direction ="vertical",
legend.title = element_blank(),
axis.text = element_text(family = "Arial", size = 7),
axis.title = element_text(family = "Arial", size = 8),
strip.text.x = element_text(family = "Arial", size = 8),
legend.text = element_text(family = "Arial", size = 7)) +
scale_fill_manual(name = NULL,
values = c("#00AFBB", "#8BC34A", "#E7B800"),
labels = c("Primary", "Reprogr", "Uninfected"))
Calculate CPM, if it has not been calculated already
if (! exists("expr_cpm_use")) {
expr_cpm_use <- expr_readcount_use
expr_cpm_use@x <- 1000000 *
(expr_cpm_use@x / rep.int(Matrix::colSums(expr_cpm_use),
diff(expr_cpm_use@p)))
}
dim(expr_cpm_use)
## [1] 27999 27416
grep("Myod1", gene_symbols, ignore.case = TRUE, value = TRUE)
## ENSMUSG00000009471
## "Myod1"
selected_gene <- "ENSMUSG00000009471"
p_tsne_Myod1 <-
tsne_out_coords %>%
mutate(cell = rownames(.),
expr = log10(expr_cpm_use[selected_gene, cell] + 1)) %>%
arrange(expr) %>%
ggplot(aes(x, y,
color = expr)) +
geom_point(size = .4, stroke = 0, shape = 16) +
scale_color_viridis_c(name = NULL) +
theme_void() +
theme(legend.justification = c(1, 0),
# legend.position = c(.28, .75),
legend.position = c(.92, .08),
legend.text = element_text(family = "Arial",
size = 4,
margin = margin(t = 0, r = 0,
b = 0, l = -1.8,
unit = "mm")),
legend.key.size = unit(1.5, "mm"),
legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm")) +
annotate("text",
x = -65,
# x = -72,
y = 70,
family = "Arial",
label = "Myod1\n",
color = "black",
size = 2.8,
vjust = "inward", hjust = "inward")
grep("Wt1", gene_symbols, ignore.case = TRUE, value = TRUE)
## ENSMUSG00000052748 ENSMUSG00000074987 ENSMUSG00000016458
## "Swt1" "Wt1os" "Wt1"
selected_gene <- "ENSMUSG00000016458"
p_tsne_Wt1 <-
tsne_out_coords %>%
mutate(cell = rownames(.),
expr = log10(expr_cpm_use[selected_gene, cell] + 1)) %>%
arrange(expr) %>%
ggplot(aes(x, y,
color = expr)) +
geom_point(size = .4, stroke = 0, shape = 16) +
scale_color_viridis_c(name = NULL) +
theme_void() +
theme(legend.justification = c(1, 0),
# legend.position = c(.28, .75),
legend.position = c(.92, .08),
legend.text = element_text(family = "Arial",
size = 4,
margin = margin(t = 0, r = 0,
b = 0, l = -1.8,
unit = "mm")),
legend.key.size = unit(1.5, "mm"),
legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm")) +
annotate("text",
x = -65,
# x = -72,
y = 70,
family = "Arial",
label = "Wt1\n",
color = "black",
size = 2.8,
vjust = "inward", hjust = "inward")
p_combined <- grid.arrange(p_tsne_Myod1,
p_tsne_Wt1, nrow = 1)
Define function for data preparation
prepare_lollipop_data <- function(genes,
cells,
expr_cpm,
tolerance = 1e-3) {
# cells: list
expr_cpm_selected <-
expr_cpm[genes, unlist(cells)]
expr_pct <-
lapply(seq_len(length(cells)), function(x) {
cells_in_selected_group <- cells[[x]]
sapply(genes, function(y) {
1 - mean(mapply(FUN = all.equal,
tolerance = tolerance,
expr_cpm_selected[y,
cells_in_selected_group],
0) == TRUE)
})
})
expr_avg <-
lapply(seq_len(length(cells)), function(x) {
cells_in_selected_group <- cells[[x]]
Matrix::rowMeans(expr_cpm_selected[, cells_in_selected_group])
})
expr_pct <- do.call(cbind.data.frame, expr_pct)
expr_avg <- do.call(cbind.data.frame, expr_avg)
colnames(expr_pct) <- seq_len(length(cells))
colnames(expr_avg) <-seq_len(length(cells))
expr_pct$gene <- rownames(expr_pct)
expr_avg$gene <- rownames(expr_avg)
expr_pct_long <-
gather(expr_pct,
key = group,
value = expr.pct,
seq_len(ncol(expr_pct) - 1))
expr_avg_long <-
gather(expr_avg,
key = cluster,
value = expr.avg,
seq_len(ncol(expr_avg) - 1))
expr_pct_avg <- cbind(expr_pct_long,
expr_avg_long)
expr_pct_avg[, c(4, 5)] <- NULL
expr_pct_avg$gene <- factor(expr_pct_avg$gene,
levels = genes)
return(expr_pct_avg)
}
# get cell groups
cells_cluster20_reprogr <-
tsne_out_coords %>% mutate(cell = rownames(.)) %>%
filter(cluster == 20 & category %in% ("JD168")) %>% .$cell
cells_cluster20_reprogr_other <-
cells_reprogr[! cells_reprogr %in% cells_cluster20_reprogr]
cells_selected <- list(cells_cluster20_reprogr,
cells_cluster20_reprogr_other,
cells_uninfected)
# select genes
genes_selected <- c("ENSMUSG00000009471",
"ENSMUSG00000031972",
"ENSMUSG00000017300")
# summarize expr info
lollipop_data_use <-
prepare_lollipop_data(genes = genes_selected,
cells = cells_selected,
expr_cpm = expr_cpm_use)
# plot
lollipop_data_use %>%
mutate(symbol = gene_symbols[as.character(gene)],
expr.avg.log10 = log10(expr.avg + 1)) %>%
filter(symbol %in% c("Myod1")) %>%
ggplot(aes(group, expr.avg.log10, fill = group)) +
geom_bar(stat = "identity", position ="dodge") +
scale_fill_manual(name = NULL,
values = c("#FF5722",
"grey35",
"grey35")) +
facet_wrap(~ symbol) +
theme_bw() +
scale_y_continuous(name = expression(paste("Avg expr (log"[10], " cpm)")),
breaks = 0:3,
labels = scales::math_format(10 ^ .x)) +
scale_x_discrete(name = NULL,
labels = c("Cluster 20",
"Other",
"Uninfected")) +
theme(axis.text = element_text(family = "Arial", size = 7),
axis.title = element_text(family = "Arial", size = 8),
axis.text.x = element_text(angle = 90,
vjust = .5,
hjust = 1),
axis.title.y = element_text(margin = margin(t = 0, r = 0,
b = 0, l = 0,
unit = "mm")),
strip.text.x = element_text(family = "Arial", size = 8),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
guides(fill = FALSE)
# get cell groups
cells_cluster12 <-
tsne_out_coords %>% mutate(cell = rownames(.)) %>%
filter(cluster == 12) %>% .$cell
cells_cluster12_primary <-
tsne_out_coords %>% mutate(cell = rownames(.)) %>%
filter(cluster == 12 & ! category %in% c("JD168", "JD174")) %>% .$cell
cells_cluster12_reprogr <-
tsne_out_coords %>% mutate(cell = rownames(.)) %>%
filter(cluster == 12 & category %in% c("JD168")) %>% .$cell
cells_cluster12_reprogr_other <-
cells_reprogr[! cells_reprogr %in% cells_cluster12_reprogr]
cells_selected <- list(cells_cluster12_primary,
cells_cluster12_reprogr,
cells_cluster12_reprogr_other,
cells_uninfected)
# select genes
genes_selected <- c("ENSMUSG00000016458",
"ENSMUSG00000025105")
# summarize expr info
lollipop_data_use <-
prepare_lollipop_data(genes = genes_selected,
cells = cells_selected,
expr_cpm = expr_cpm_use)
# plot
lollipop_data_use %>%
mutate(symbol = factor(gene_symbols[as.character(gene)],
levels = c("Wt1", "Bnc1")),
expr.avg.log10 = log10(expr.avg + 1)) %>%
ggplot(aes(group, expr.avg.log10, fill = group)) +
geom_bar(stat = "identity", position ="dodge") +
scale_fill_manual(name = NULL,
values = c("#00BFC4",
"#FF5722",
"grey35",
"grey35")) +
facet_wrap(~ symbol) +
theme_bw() +
scale_y_continuous(name = expression(paste("Avg expr (log"[10], " cpm)")),
breaks = 0:3,
labels = scales::math_format(10 ^ .x)) +
scale_x_discrete(name = NULL,
labels = c("Primary epi.",
"Cluster 12",
"Other",
"Uninfected")) +
theme(axis.text = element_text(family = "Arial", size = 7),
axis.title = element_text(family = "Arial", size = 8),
axis.text.x = element_text(angle = 90,
vjust = .5,
hjust = 1),
axis.title.y = element_text(margin = margin(t = 0, r = 0,
b = 0, l = 0,
unit = "mm")),
strip.text.x = element_text(family = "Arial", size = 8),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
guides(fill = FALSE)
Define function
run_binomial_test <- function(expr,
expr_cpm = NULL,
cell_group_a,
cell_group_b = NULL,
log2_effect_size_cutoff = 1,
positive_frac_cutoff = .2,
p.adj_method = "BH") {
if (is.null(cell_group_b)) {
cell_group_b <- colnames(expr)[!colnames(expr) %in% cell_group_a]
cell_group_b_label <- "rest"
message("cells in group a vs the rest")
}
m <- Matrix::rowSums(expr[, cell_group_b] > 0)
m1 <- m
m1[m == 0] <- 1
n <- Matrix::rowSums(expr[, cell_group_a] > 0)
pv1 <- pbinom(n, length(cell_group_a), m1 / length(cell_group_b),
lower.tail = FALSE) + dbinom(n, length(cell_group_a),
m1 / length(cell_group_b))
expressed_ratio_log_fold <-
log(n * length(cell_group_b) / (m * length(cell_group_a)), base = 2)
d1 <- data.frame(log2.effect = expressed_ratio_log_fold,
pval = pv1)
d1 <- d1[d1$log2.effect >= log2_effect_size_cutoff, ]
d1 <- d1[complete.cases(d1), ]
n1 <- n
n1[n == 0] <- 1
pv2 <- pbinom(m, length(cell_group_b), n1 / length(cell_group_a),
lower.tail = FALSE) + dbinom(m, length(cell_group_b),
n1 / length(cell_group_a))
d2 <- data.frame(log2.effect = expressed_ratio_log_fold,
pval = pv2)
d2 <- d2[d2$log2.effect <= -log2_effect_size_cutoff, ]
d2 <- d2[complete.cases(d2), ]
d <- rbind(d1, d2)
positive_frac_a <- round(
Matrix::rowSums(expr[rownames(d),
cell_group_a] > 0) / ncol(expr[rownames(d),
cell_group_a]), 3)
positive_frac_b <- round(
Matrix::rowSums(expr[rownames(d),
cell_group_b] > 0) / ncol(expr[rownames(d),
cell_group_b]), 3)
d$pos.frac.a <- positive_frac_a
d$pos.frac.b <- positive_frac_b
transcripts_use <-
(positive_frac_a >=
positive_frac_cutoff) | (positive_frac_b >=
positive_frac_cutoff)
d <- d[transcripts_use, ]
d <- d[order(abs(d$log2.effect), decreasing = TRUE), ]
d$norm.reads.mean.a <- Matrix::rowMeans(expr[rownames(d), cell_group_a])
d$norm.reads.mean.b <- Matrix::rowMeans(expr[rownames(d), cell_group_b])
d$norm.reads.log2.ratio <-
log((d$norm.reads.mean.a + .01)/ (d$norm.reads.mean.b + .01),
base = 2)
if (! is.null(expr_cpm)) {
expr_cpm_use <- expr_cpm[rownames(d), ]
d$cpm.meam.a <- Matrix::rowMeans(expr_cpm_use[, cell_group_a])
d$cpm.meam.b <- Matrix::rowMeans(expr_cpm_use[, cell_group_b])
d$cpm.log2.ratio <-
log((d$cpm.meam.a + .01) / (d$cpm.meam.b + .01), base = 2)
}
d$p.adj = p.adjust(d$pval, method = p.adj_method)
return(d)
}
library(topGO)
packageVersion("topGO")
## [1] '2.34.0'
Enrich GO terms of highly expressed genes in cluster 20
de_paired <- run_binomial_test(expr = expr_readcount_norm,
expr_cpm = expr_cpm_use,
cell_group_a = cells_cluster20_reprogr,
cell_group_b = cells_cluster20_reprogr_other,
log2_effect_size_cutoff = 1,
positive_frac_cutoff = .2,
p.adj_method = "BH")
genes_of_interest <- rownames(subset(de_paired, log2.effect > 0))
gene_universe <- rownames(expr_readcount_norm)
genes_formatted <- factor(as.integer(gene_universe %in% genes_of_interest))
names(genes_formatted) <- gene_universe
topgo_data <- new("topGOdata", ontology = "BP",
allGenes = genes_formatted,
annot = annFUN.org,
mapping = "org.Mm.eg.db",
ID = "Ensembl")
topgo_out_classic_fisher <- runTest(topgo_data,
algorithm = "classic",
statistic = "fisher")
num_go_terms <- 15
# prepare data
gentable_use <-
GenTable(topgo_data,
classicFisher = topgo_out_classic_fisher,
topNodes = num_go_terms) %>%
mutate(classicFisher = as.numeric(str_replace(classicFisher, "< ", "")),
Term = factor(Term(GO.ID),
levels = rev(Term(GO.ID))))
# plot
ggplot(gentable_use, aes(y = -log10(classicFisher), x = Term)) +
geom_bar(stat = "identity", fill = "grey70") + coord_flip() +
labs(x = NULL) +
theme_classic() +
scale_y_continuous(name = expression(paste("-log"[10]," (p-value)")),
breaks = c(0, 30),
labels = scales::math_format(10 ^ .x)) +
annotate("text",
x = seq_len(nrow(gentable_use)),
y = rep(1, nrow(gentable_use)),
label = rev(gentable_use$Term),
# vjust = "inward",
hjust = "inward",
size = 2.5,
family = "Arial") +
theme(axis.text = element_text(family = "Arial", size = 7),
axis.title = element_text(family = "Arial", size = 8),
legend.title = element_text(family = "Arial", size = 8),
legend.text = element_text(family = "Arial", size = 8),
axis.line.y = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())
Define function
analyze_gene_enrichment <- function(expr,
cell_group_a,
cell_group_b,
gene_symbols = NULL,
expr_cpm = NULL) {
m <- Matrix::rowSums(expr[, cell_group_b] > 0)
m1 <- m
m1[m == 0] <- 1
n <- Matrix::rowSums(expr[, cell_group_a] > 0)
pv1 <- pbinom(n, length(cell_group_a), m1 / length(cell_group_b),
lower.tail = FALSE) + dbinom(n, length(cell_group_a),
m1 / length(cell_group_b))
expressed_ratio_log_fold <-
log(n * length(cell_group_b) / (m * length(cell_group_a)), base = 2)
d1 <- data.frame(log2.effect = expressed_ratio_log_fold,
pval = pv1)
d1$pos.frac.a <- Matrix::rowMeans(expr[, cell_group_a] > 0)
d1$pos.frac.b <- Matrix::rowMeans(expr[, cell_group_b] > 0)
if (! is.null(gene_symbols)) {
d1$symbol <- gene_symbols[rownames(d1)]
}
if (! is.null(expr_cpm)) {
d1$mean.cpm.a <- Matrix::rowMeans(expr_cpm[rownames(d1), cell_group_a])
d1$mean.cpm.b <- Matrix::rowMeans(expr_cpm[rownames(d1), cell_group_b])
}
d1$p.adj <- p.adjust(d1$pval, method = "BH")
return(d1)
}
# select reprogrammed cell clusters
clusters_selected_reprogr <- c(1, 2, 3, 10, 12, 17, 20, 22)
genes_selected_43 <- c("ENSMUSG00000023067",
"ENSMUSG00000086369",
"ENSMUSG00000042002",
"ENSMUSG00000021944",
"ENSMUSG00000015627",
"ENSMUSG00000005836",
"ENSMUSG00000037335",
"ENSMUSG00000038193",
"ENSMUSG00000028800",
"ENSMUSG00000019777",
"ENSMUSG00000026313",
"ENSMUSG00000040289",
"ENSMUSG00000019789",
"ENSMUSG00000042258",
"ENSMUSG00000063568",
"ENSMUSG00000024063",
"ENSMUSG00000030557",
"ENSMUSG00000079033",
"ENSMUSG00000005583",
"ENSMUSG00000001419",
"ENSMUSG00000020160",
"ENSMUSG00000030544",
"ENSMUSG00000025930",
"ENSMUSG00000048450",
"ENSMUSG00000021469",
"ENSMUSG00000020542",
"ENSMUSG00000009471",
"ENSMUSG00000015579",
"ENSMUSG00000026923",
"ENSMUSG00000030551",
"ENSMUSG00000026565",
"ENSMUSG00000009739",
"ENSMUSG00000001288",
"ENSMUSG00000015846",
"ENSMUSG00000027833",
"ENSMUSG00000024515",
"ENSMUSG00000028949",
"ENSMUSG00000032419",
"ENSMUSG00000000093",
"ENSMUSG00000018604",
"ENSMUSG00000018263",
"ENSMUSG00000020167",
"ENSMUSG00000025860")
summary_enrichment_genes_selected <-
do.call(rbind.data.frame,
lapply(clusters_selected_reprogr, function(x) {
cat("Computing cluster", x, "\n")
cells_in_selected_group <-
rownames(tsne_out_coords)[tsne_out_coords$cluster == x &
tsne_out_coords$category == "JD168"]
cells_in_selected_group_other <-
cells_reprogr[! cells_reprogr %in% cells_in_selected_group]
y <- analyze_gene_enrichment(expr = expr_readcount_norm,
cell_group_a = cells_in_selected_group,
cell_group_b = cells_in_selected_group_other,
gene_symbols = gene_symbols[rownames(expr_readcount_norm)],
expr_cpm_use)[genes_selected_43, ]
y$category <- x
return(y)
}))
## Computing cluster 1
## Computing cluster 2
## Computing cluster 3
## Computing cluster 10
## Computing cluster 12
## Computing cluster 17
## Computing cluster 20
## Computing cluster 22
summary_enrichment_genes_selected_use <-
summary_enrichment_genes_selected %>%
filter(category %in% c(20, 12, 2)) %>%
mutate(category = factor(category,
levels = c(20, 12, 2)))
facet_labels <- paste("Cluster", levels(summary_enrichment_genes_selected_use$category))
names(facet_labels) <- levels(summary_enrichment_genes_selected_use$category)
# plot
ggplot() +
geom_abline(intercept = 0, slope = 1, linetype = 2) +
geom_point(data = summary_enrichment_genes_selected_use,
aes(pos.frac.b,
pos.frac.a,
size = -log10(p.adj),
color = log2.effect),
alpha = .8,
stroke = 0, shape = 16) +
facet_wrap(~ category,
nrow = 1,
labeller = labeller(category = facet_labels)) +
coord_fixed() +
scale_x_continuous(name = "Expr (%, other reprogrammed cells)",
limits = c(0, 1), breaks = seq(0, 1, .2)) +
scale_y_continuous(name = "Expr (%, indicated cluster)",
limits = c(0, 1), breaks = seq(0, 1, .2)) +
scale_color_viridis_c(name = expression(paste("Log"[2], " effect"))) +
theme_bw() +
labs(size = expression(paste("-log"[10], " (p-value)"))) +
theme(axis.text = element_text(family = "Arial", size = 7),
axis.title = element_text(family = "Arial", size = 8),
strip.text.x = element_text(family = "Arial", size = 8),
legend.title = element_text(family = "Arial", size = 8),
legend.text = element_text(family = "Arial", size = 6),
legend.position = "bottom",
legend.box = "horizontal",
legend.key.size = unit(2.5, "mm"),
legend.margin = margin(t = 0, r = 0, b = 0, l = 0, unit = "mm"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
guides(color = guide_colorbar(order = 1),
size = guide_legend(order = 2)) +
geom_text_repel(data = subset(summary_enrichment_genes_selected_use,
symbol %in% c("Myod1", "Mef2b",
"Mef2c", "Hand2", "Gata6")),
aes(pos.frac.b,
pos.frac.a,
label = subset(summary_enrichment_genes_selected_use,
symbol %in% c("Myod1", "Mef2b",
"Mef2c", "Hand2",
"Gata6"))$symbol),
size = 2.5,
family = "Arial",
box.padding = .2,
point.padding = .2,
nudge_y = .15,
arrow = arrow(length = unit(.02, 'npc')),
segment.color = "grey35",
color = "black")
genes_selected_10 <- c("ENSMUSG00000026628",
"ENSMUSG00000016458",
"ENSMUSG00000025105",
"ENSMUSG00000051910",
"ENSMUSG00000045680",
"ENSMUSG00000038193",
"ENSMUSG00000031965",
"ENSMUSG00000032419",
"ENSMUSG00000005836",
"ENSMUSG00000036098")
gene_symbols[genes_selected_10]
## ENSMUSG00000026628 ENSMUSG00000016458 ENSMUSG00000025105
## "Atf3" "Wt1" "Bnc1"
## ENSMUSG00000051910 ENSMUSG00000045680 ENSMUSG00000038193
## "Sox6" "Tcf21" "Hand2"
## ENSMUSG00000031965 ENSMUSG00000032419 ENSMUSG00000005836
## "Tbx20" "Tbx18" "Gata6"
## ENSMUSG00000036098
## "Myrf"
clusters_selected <- levels(tsne_out_coords$cluster)[
! levels(tsne_out_coords$cluster) %in% c(1, 2, 3, 10, 17, 20, 22)]
cells_selected_lollipop <-
lapply(clusters_selected, function(x) {
rownames(tsne_out_coords)[tsne_out_coords$cluster == x
& (! tsne_out_coords$category %in%
c("JD168", "JD174"))]
})
names(cells_selected_lollipop) <- clusters_selected
cells_selected_lollipop$uninfected <- cells_uninfected
expr_pct_avg_lollipop <-
prepare_lollipop_data(genes = genes_selected_10,
cells = cells_selected_lollipop,
expr_cpm = expr_cpm_use,
tolerance = 1e-3)
# set y axis label colors
colors_axis_labels_y <- unique(p_built_colors$colour)
colors_axis_labels_y <- setNames(colors_axis_labels_y, unique(p_built_colors$group))
colors_axis_labels_y <- c(colors_axis_labels_y, uninfected = "grey50")
# set y axis labels
axis_labels_y <- c("Epicardial cell",
"Uninfected MEF",
"CM 2",
"CM 1",
"Atrial CM",
"Atrial CM",
"Atrial CM",
"Atrial CM",
"Ventricular CM",
"CM 3",
"Cardiac fibroblast",
"Cardiac fibroblast",
"Cardiac fibroblast",
"Macrophage",
"Endothelial cell",
"Lymphocyte")
# order y axis
axis_y <- c("12",
"uninfected",
"19",
"18",
"9",
"6",
"11",
"4",
"8",
"21",
"13",
"7",
"14",
"15",
"5",
"16")
expr_pct_avg_lollipop %>%
mutate(group = factor(names(cells_selected_lollipop)[as.integer(group)],
levels = rev(axis_y)),
symbol = gene_symbols[as.character(gene)]) %>%
ggplot(aes(symbol,
group,
size = expr.pct,
color = log(expr.avg + 1, 10))) +
geom_point() +
guides(color = guide_colorbar(order = 1),
size = guide_legend(order = 2)) +
scale_color_viridis_c(name = expression(paste("Avg expr (log"[10], " cpm)")),
limits = c(0, 3)) +
theme_light() +
scale_x_discrete(name = NULL, position = "top") +
scale_y_discrete(name = NULL, label = rev(axis_labels_y)) +
scale_size(name = "% cells expressing gene",
limits = c(0, 1),
breaks = seq(0, 1, .2),
range = c(0, 4)) +
theme(axis.text = element_text(family = "Arial", size = 8),
axis.title = element_text(family = "Arial", size = 6),
axis.text.x = element_text(angle = - 45,
vjust = .5,
hjust = 1),
axis.text.y = element_text(color = colors_axis_labels_y[rev(axis_y)]),
legend.title = element_text(family = "Arial", size = 8),
legend.text = element_text(family = "Arial", size = 8),
legend.position = "bottom",
legend.box = "vertical",
legend.key.size = unit(2.5, "mm"),
legend.margin = margin(t = -2, r = 15, b = 0, l = -5, unit = "mm"))
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin18.2.0 (64-bit)
## Running under: macOS Mojave 10.14.3
##
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.5/lib/libopenblas_haswellp-r0.3.5.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] org.Mm.eg.db_3.7.0 topGO_2.34.0 SparseM_1.77
## [4] GO.db_3.7.0 AnnotationDbi_1.44.0 IRanges_2.16.0
## [7] S4Vectors_0.20.1 Biobase_2.42.0 graph_1.60.0
## [10] BiocGenerics_0.28.0 ggrepel_0.8.0 extrafont_0.17
## [13] gridExtra_2.3 scales_1.0.0.9000 forcats_0.4.0.9000
## [16] stringr_1.4.0.9000 dplyr_0.8.0.9006 purrr_0.3.1
## [19] readr_1.3.1.9000 tidyr_0.8.3.9000 tibble_2.0.1.9001
## [22] ggplot2_3.1.0.9000 tidyverse_1.2.1.9000 Matrix_1.2-15
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.0.9000 bit64_0.9-7 jsonlite_1.6
## [4] viridisLite_0.3.0 modelr_0.1.4 assertthat_0.2.0
## [7] blob_1.1.1 cellranger_1.1.0 yaml_2.2.0
## [10] Rttf2pt1_1.3.7 pillar_1.3.1.9000 RSQLite_2.1.1
## [13] backports_1.1.3 lattice_0.20-38 glue_1.3.0.9000
## [16] extrafontdb_1.0 digest_0.6.18 rvest_0.3.2
## [19] colorspace_1.4-0 htmltools_0.3.6 pkgconfig_2.0.2
## [22] broom_0.5.1.9000 haven_2.1.0 generics_0.0.2
## [25] withr_2.1.2 lazyeval_0.2.1 cli_1.0.1
## [28] magrittr_1.5.0.9000 crayon_1.3.4 readxl_1.3.0.9000
## [31] memoise_1.1.0 evaluate_0.13 fs_1.2.6.9000
## [34] xml2_1.2.0 tools_3.5.2 hms_0.4.2.9001
## [37] matrixStats_0.54.0 munsell_0.5.0 reprex_0.2.1.9000
## [40] compiler_3.5.2 rlang_0.3.1.9000 grid_3.5.2
## [43] rstudioapi_0.9.0 labeling_0.3 rmarkdown_1.11
## [46] gtable_0.2.0 DBI_1.0.0 R6_2.4.0
## [49] lubridate_1.7.4.9000 knitr_1.21 bit_1.1-14
## [52] zeallot_0.1.0 stringi_1.3.1 Rcpp_1.0.0
## [55] vctrs_0.1.0.9002 dbplyr_1.3.0 tidyselect_0.2.5
## [58] xfun_0.5